The goal of this challenge is to identify which simulated ATLAS experiment proton-proton collision observations contain evidence of boson pairs (“signal”), and which do not (“background”).
The challenge is hosted by Kaggle and provided by The Atlas Experiment. Technical details are found in the Laboratoire de l’Accélerateur Linéaire technical documentation PDF
Provided were a training set of 250,000 observations and a test set of 500,000 observations. Unusable or unknown quantities are reported as -999.0.
The variables include 17 “primitive” quantities and 13 “derived” quantities. Also included is the EventId identifying the observation. The training set also includes the goal label (b or s) and a “Weight” metric not to be used in classification. For this reason I've stripped it out of the training set and saved a new CSV as processed_train.csv
train <- read.csv("data/processed/processed_train.csv")
names(train)
## [1] "EventId" "DER_mass_MMC"
## [3] "DER_mass_transverse_met_lep" "DER_mass_vis"
## [5] "DER_pt_h" "DER_deltaeta_jet_jet"
## [7] "DER_mass_jet_jet" "DER_prodeta_jet_jet"
## [9] "DER_deltar_tau_lep" "DER_pt_tot"
## [11] "DER_sum_pt" "DER_pt_ratio_lep_tau"
## [13] "DER_met_phi_centrality" "DER_lep_eta_centrality"
## [15] "PRI_tau_pt" "PRI_tau_eta"
## [17] "PRI_tau_phi" "PRI_lep_pt"
## [19] "PRI_lep_eta" "PRI_lep_phi"
## [21] "PRI_met" "PRI_met_phi"
## [23] "PRI_met_sumet" "PRI_jet_num"
## [25] "PRI_jet_leading_pt" "PRI_jet_leading_eta"
## [27] "PRI_jet_leading_phi" "PRI_jet_subleading_pt"
## [29] "PRI_jet_subleading_eta" "PRI_jet_subleading_phi"
## [31] "PRI_jet_all_pt" "Label"
signal <- which(train$Label == "s")
require(ggplot2)
## Loading required package: ggplot2
create_plot <- function(data, v, yintercept = c()) {
p <- ggplot(data, aes(x = EventId, y = data[, v], color = Label), environment = environment())
l <- p + labs(y = v)
if (length(yintercept) == 0) {
print(l + geom_point())
} else {
print(l + geom_point() + geom_hline(yintercept = yintercept))
}
}
PRI_tau_pt
The transverse momentum \( \sqrt{p^2_x+p^2_y} \), or the momentum perpendicular to the beam line, of the hadronic tau measured in GeV
create_plot(train, "PRI_tau_pt")
ptp <- train$PRI_tau_pt
summary(ptp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.0 24.6 31.8 38.7 45.0 764.0
Most observations are < 175 GeV, let's replot on that subset
ptp.175 <- subset(train, PRI_tau_pt < 175)
create_plot(ptp.175, "PRI_tau_pt")
It looks like the slower the transverse momentum, the more likely the observation is background noise. Let's replot for < 50
ptp.50 <- subset(train, PRI_tau_pt < 50)
create_plot(ptp.50, "PRI_tau_pt")
PRI_tau_eta
The pseudorapidity η (the angle from the beam axis) of the hadronic tau.
create_plot(train, "PRI_tau_eta")
It looks like a higher quantity of signal observations exist within -1.5 to 1.5 than outside this range.
signal.obs <- subset(train, Label == "s", select = PRI_tau_eta)
# quantity signal in range -1.5 to 1.5
in.range <- subset(signal.obs, PRI_tau_eta >= -1.5 & PRI_tau_eta <= 1.5)
nrow(in.range)/nrow(signal.obs)
## [1] 0.7974
Almost 80% of signal observations fall in the -1.5 to 1.5 range. Is it also more likely that an observation outside of this range is background?
outside.range <- subset(train, PRI_tau_eta < -1.5 | PRI_tau_eta > 1.5, select = Label)
table(outside.range)
## outside.range
## b s
## 49179 17355
length(which(outside.range == "b"))/nrow(outside.range)
## [1] 0.7392
About 74% of observations outside the range -1.5 to 1.5 are background noise.
PRI_tau_phi
The azimuthal angle φ, from -pi to pi in the spherical coordinate system, of the hadronic tau around the beam axis.
create_plot(train, "PRI_tau_phi")
# all observations
summary(train$PRI_tau_phi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5800 -0.0330 -0.0082 1.5600 3.1400
# just the signal
summary(train[signal, "PRI_tau_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5800 -0.0590 -0.0192 1.5400 3.1400
# just the background
summary(train[-signal, "PRI_tau_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5700 -0.0170 -0.0024 1.5800 3.1400
PRI_lep_pt
The transverse momentum \( \sqrt{p^2_x+p^2_y} \), or the momentum perpendicular to the beam line, of the lepton (electron or muon) measured in GeV
create_plot(train, "PRI_lep_pt")
Replotting to < 200
plp.200 <- subset(train, PRI_lep_pt < 200)
create_plot(plp.200, "PRI_lep_pt")
summary(train$PRI_lep_pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.0 32.4 40.5 46.7 53.4 560.0
PRI_lep_eta
The pseudorapidity η (the angle from the beam axis) of the lepton.
create_plot(train, "PRI_lep_eta")
Just as with the pseudorapidity of the hadronic tau it appears the signal clusters in the range of -1.5 to 1.5
signal.obs <- subset(train, Label == "s", select = PRI_lep_eta)
# quantity signal in range -1.5 to 1.5
in.range <- subset(signal.obs, PRI_lep_eta >= -1.5 & PRI_lep_eta <= 1.5)
nrow(in.range)/nrow(signal.obs)
## [1] 0.7837
Aabout 78% of signal observations fall in the -1.5 to 1.5 range.
outside.range <- subset(train, PRI_lep_eta < -1.5 | PRI_lep_eta > 1.5, select = Label)
table(outside.range)
## outside.range
## b s
## 56096 18534
length(which(outside.range == "b"))/nrow(outside.range)
## [1] 0.7517
About 75% of observations outside the range -1.5 to 1.5 are background noise.
PRI_lep_phi
The azimuthal angle φ, from -pi to pi in the spherical coordinate system, of the lepton around the beam axis.
create_plot(train, "PRI_lep_phi")
# all observations
summary(train$PRI_lep_phi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5200 0.0860 0.0435 1.6200 3.1400
# just the signal
summary(train[signal, "PRI_lep_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5000 0.1030 0.0539 1.6300 3.1400
# just the background
summary(train[-signal, "PRI_lep_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5300 0.0760 0.0381 1.6100 3.1400
PRI_met The missing transverse energy \( \overrightarrow{E}^{miss}_T \)
create_plot(train, "PRI_met")
summary(train$PRI_met)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1 21.4 34.8 41.7 51.9 2840.0
# just the signal
summary(train[signal, "PRI_met"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2 17.9 31.9 42.7 54.5 2840.0
# just the background
summary(train[-signal, "PRI_met"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1 23.3 35.9 41.2 51.0 951.0
That one outlier really makes this blurry…
pm.500 <- subset(train, PRI_met < 500)
create_plot(pm.500, "PRI_met")
pm.200 <- subset(train, PRI_met < 200)
create_plot(pm.200, "PRI_met")
pm.100 <- subset(train, PRI_met < 100)
create_plot(pm.100, "PRI_met")
There appears to be a lot of signal clustered < 25
under.25 <- subset(train[signal, ], PRI_met < 25)
nrow(under.25)/length(signal)
## [1] 0.3842
About 38% of the signal is < 25
PRI_met_phi
The azimuth angle φ of the missing transverse energy.
create_plot(train, "PRI_met_phi")
summary(train$PRI_met_phi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5800 -0.0240 -0.0101 1.5600 3.1400
# just the signal
summary(train[signal, "PRI_met_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5500 0.0190 0.0086 1.5800 3.1400
# just the background
summary(train[-signal, "PRI_met_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5800 -0.0440 -0.0199 1.5500 3.1400
PRI_met_sumet
The total transverse energy in the detector.
create_plot(train, "PRI_met_sumet")
summary(train$PRI_met_sumet)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.7 123.0 180.0 210.0 263.0 2000.0
# just the signal
summary(train[signal, "PRI_met_sumet"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.7 146.0 208.0 234.0 291.0 1390.0
# just the background
summary(train[-signal, "PRI_met_sumet"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.5 114.0 166.0 197.0 246.0 2000.0
pms.750 <- subset(train, PRI_met_sumet < 750)
create_plot(pms.750, "PRI_met_sumet")
pms.300 <- subset(train, PRI_met_sumet < 300)
create_plot(pms.300, "PRI_met_sumet")
pms.200 <- subset(train, PRI_met_sumet < 200)
create_plot(pms.200, "PRI_met_sumet")
PRI_jet_num
The number of jets (a narrow cone of hadrons and other particles produced by the hadronization of a quark or gluon) http://en.wikipedia.org/wiki/Jet_(particle_physics)
ggplot(data = train, aes(x = PRI_jet_num, fill = Label)) + geom_bar(position = position_dodge())
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
PRI_jet_leading_pt The transverse momentum \( \sqrt{p^2_x+p^2_y} \) of the leading jet, that is the jet with largest transverse momentum (undefined if PRI jet num = 0).
create_plot(train, "PRI_jet_leading_pt")
pjlp.na <- subset(train, PRI_jet_leading_pt == -999, select = Label)
table(pjlp.na)
## pjlp.na
## b s
## 74421 25492
length(which(pjlp.na == "b"))/nrow(pjlp.na)
## [1] 0.7449
75% of undefined are background.
pjlp <- subset(train, PRI_jet_leading_pt != -999)
create_plot(pjlp, "PRI_jet_leading_pt")
summary(pjlp$PRI_jet_leading_pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 44.4 65.6 84.8 103.0 1120.0
pjlp.signal <- which(pjlp$Label == "s")
# just the signal
summary(pjlp[pjlp.signal, "PRI_jet_leading_pt"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 50.1 74.4 92.9 114.0 738.0
# just the background
summary(pjlp[-pjlp.signal, "PRI_jet_leading_pt"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 41.7 60.1 79.4 95.2 1120.0
PRI_jet_leading_eta
The pseudorapidity η of the leading jet (undefined if PRI jet num = 0).
create_plot(train, "PRI_jet_leading_eta")
pjle.na <- subset(train, PRI_jet_leading_eta == -999, select = Label)
table(pjle.na)
## pjle.na
## b s
## 74421 25492
length(which(pjle.na == "b"))/nrow(pjle.na)
## [1] 0.7449
74% of undefined are background.
pjle <- subset(train, PRI_jet_leading_eta != -999)
create_plot(pjle, "PRI_jet_leading_eta", c(-1.5, 1.5))
## Warning: the condition has length > 1 and only the first element will be
## used
pjle.inrange <- which(pjle$PRI_jet_leading_eta >= -1.5 & pjle$PRI_jet_leading_eta <=
1.5)
table(pjle[pjle.inrange, "Label"])
##
## b s
## 55373 27680
nrow(subset(pjle[pjle.inrange, ], Label == "b"))/length(pjle.inrange)
## [1] 0.6667
67% of defined observations in the -1.5 to 1.5 range are background
What percentange of defined observations outside the -1.5 to 1.5 range are signal?
out.range <- subset(pjle, PRI_jet_leading_eta < -1.5 | PRI_jet_leading_eta >
1.5, select = "Label")
table(out.range)
## out.range
## b s
## 34539 32495
summary(pjle$PRI_jet_leading_eta)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.500 -1.340 0.000 -0.003 1.340 4.500
pjle.signal <- which(pjle$Label == "s")
# just the signal
summary(pjle[pjle.signal, "PRI_jet_leading_eta"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.500 -1.630 -0.005 -0.003 1.620 4.500
# just the background
summary(pjle[-pjle.signal, "PRI_jet_leading_eta"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.500 -1.170 0.002 -0.004 1.170 4.490
PRI_jet_leading_phi
The azimuth angle φ of the leading jet (undefined if PRI jet num = 0).
create_plot(train, "PRI_jet_leading_phi")
pjlphi.na <- subset(train, PRI_jet_leading_phi == -999, select = Label)
table(pjlphi.na)
## pjlphi.na
## b s
## 74421 25492
length(which(pjlphi.na == "b"))/nrow(pjlphi.na)
## [1] 0.7449
75% of undefined are background
pjlphi <- subset(train, PRI_jet_leading_phi != -999)
create_plot(pjlphi, "PRI_jet_leading_phi")
summary(pjlphi$PRI_jet_leading_phi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5800 -0.0330 -0.0124 1.5600 3.1400
pjlphi.signal <- which(pjlphi$Label == "s")
# just the signal
summary(pjlphi[pjlphi.signal, "PRI_jet_leading_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5900 -0.0360 -0.0129 1.5700 3.1400
# just the background
summary(pjlphi[-pjlphi.signal, "PRI_jet_leading_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5800 -0.0310 -0.0121 1.5600 3.1400
PRI_jet_subleading_pt
The transverse momentum \( \sqrt{p^2_x+p^2_y} \) of the second leading jet. (undefined if PRI jet num ≤ 1).
create_plot(train, "PRI_jet_subleading_pt")
pjsp.na <- subset(train, PRI_jet_subleading_pt == -999, select = Label)
table(pjsp.na)
## pjsp.na
## b s
## 124255 53202
length(which(pjsp.na == "b"))/nrow(pjsp.na)
## [1] 0.7002
70% of undefined are background
pjsp <- subset(train, PRI_jet_subleading_pt != -999)
create_plot(pjsp, "PRI_jet_subleading_pt")
summary(pjsp$PRI_jet_subleading_pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 37.3 47.9 57.7 66.6 721.0
pjsp.signal <- which(pjsp$Label == "s")
# just the signal
summary(pjsp[pjsp.signal, "PRI_jet_subleading_pt"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 37.8 48.6 56.9 66.8 422.0
# just the background
summary(pjsp[-pjsp.signal, "PRI_jet_subleading_pt"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.0 37.0 47.4 58.3 66.5 721.0
PRI_jet_subleading_phi
The azimuth angle φ of the subleading jet (undefined if PRI jet num ≤ 1).
create_plot(train, "PRI_jet_subleading_phi")
pjsphi.na <- subset(train, PRI_jet_subleading_phi == -999, select = Label)
table(pjsphi.na)
## pjsphi.na
## b s
## 124255 53202
length(which(pjsphi.na == "b"))/nrow(pjsphi.na)
## [1] 0.7002
70% of undefined are background
pjsphi <- subset(train, PRI_jet_subleading_phi != -999)
create_plot(pjsphi, "PRI_jet_subleading_phi")
summary(pjsphi$PRI_jet_subleading_phi)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5800 -0.0020 -0.0016 1.5800 3.1400
pjsphi.signal <- which(pjsphi$Label == "s")
# just the signal
summary(pjsphi[pjsphi.signal, "PRI_jet_subleading_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5900 -0.0240 -0.0143 1.5600 3.1400
# just the background
summary(pjsphi[-pjsphi.signal, "PRI_jet_subleading_phi"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1400 -1.5700 0.0160 0.0087 1.5900 3.1400
PRI_jet_all_pt
The scalar sum of the transverse momentum of all the jets.
create_plot(train, "PRI_jet_all_pt")
pjap.500 <- subset(train, PRI_jet_all_pt < 500)
create_plot(pjap.500, "PRI_jet_all_pt")
pjap.300 <- subset(train, PRI_jet_all_pt < 300)
create_plot(pjap.300, "PRI_jet_all_pt")
summary(train$PRI_jet_all_pt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 40.5 73.1 110.0 1630.0
# just the signal
summary(train[signal, "PRI_jet_all_pt"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 68.0 91.3 139.0 1190.0
# just the background
summary(train[-signal, "PRI_jet_all_pt"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 33.6 63.6 89.8 1630.0
DER_mass_MMC
The estimated mass of the Higgs boson candidate. May be undefined.
create_plot(train, "DER_mass_MMC")
dmm.na <- subset(train, DER_mass_MMC == -999, select = Label)
table(dmm.na)
## dmm.na
## b s
## 35279 2835
length(which(dmm.na == "b"))/nrow(dmm.na)
## [1] 0.9256
93% of undefined are background
dmm <- subset(train, DER_mass_MMC != -999)
create_plot(dmm, "DER_mass_MMC")
dmm.250 <- subset(dmm, DER_mass_MMC < 250)
create_plot(dmm.250, "DER_mass_MMC", c(75, 175))
dmm.signal <- subset(dmm, Label == "s")
dmm.signal.inrange <- subset(dmm.signal, DER_mass_MMC >= 75 & DER_mass_MMC <=
175)
nrow(dmm.signal.inrange)/nrow(dmm.signal)
## [1] 0.9702
97% of defined signal observations lie in range 75 to 175
DER_mass_transverse_met_lep
The transverse mass between the missing transverse energy and the lepton.
create_plot(train, "DER_mass_transverse_met_lep")
dmtml.200 <- subset(train, DER_mass_transverse_met_lep < 200)
create_plot(dmtml.200, "DER_mass_transverse_met_lep")
dmtml.150 <- subset(train, DER_mass_transverse_met_lep < 150)
create_plot(dmtml.150, "DER_mass_transverse_met_lep", 50)